In [126]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import itertools
import spacy
import nltk
%matplotlib inline
In [127]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
In [128]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
In [129]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
In [130]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
In [131]:
# dataframe display options
pd.set_option('display.max_colwidth', None)  # show full column contents (-1 is deprecated in newer pandas)
pd.set_option('display.max_rows', 200)
After fine-tuning the parameters of each model, it became clear that SVM performs best (F1 = 0.64), compared to Logistic Regression (F1 = 0.62) and Multinomial Naive Bayes (F1 = 0.62). However, this gain in performance comes at the cost of interpretability of feature importance and a more complex parameter space.
I will therefore use Logistic Regression and rely on top-k predictions to boost the model's accuracy.
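The `grid` object used throughout the rest of this section is not defined in these cells; it is assumed to be a GridSearchCV fitted on a text-classification pipeline whose classifier step is named "lr". A minimal sketch of what that setup might look like (the vectorizer choice, parameter grid, and X_train/y_train names are assumptions, not the tuned configuration):
In [ ]:
# Assumed setup for the `grid` object referenced below (sketch only, not the tuned values)
pipe = Pipeline([
    ("tfidf", TfidfVectorizer()),                 # raw text -> TF-IDF features
    ("lr", LogisticRegression(max_iter=1000)),    # step name "lr" matches named_steps["lr"] below
])
param_grid = {"lr__C": [0.1, 1, 10]}
grid = GridSearchCV(pipe, param_grid, cv=5, scoring="f1_weighted")
grid.fit(X_train.ravel(), y_train)                # X_train/y_train from the earlier train/test split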
In [268]:
def multiplek(y_test, topk):
    # Build an "adjusted" prediction list: if the true label is among the top-k
    # predictions, keep the true label (counts as correct); otherwise fall back
    # to the highest-probability category.
    multik = []
    for i, score in enumerate(y_test):
        if y_test[i] in topk[i]:
            multik.append(y_test[i])
        else:
            multik.append(topk[i][0])
    return multik
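This helper is not called again in this section; one possible (hypothetical) use is to turn the top-k results into per-sample labels that standard metrics such as classification_report can consume, once the top-3 predictions (`pps`, computed further below) are available:
In [ ]:
# Hypothetical usage of multiplek (assumes `pps`, the top-3 predictions computed below)
adjusted_preds = multiplek(y_test.values, pps)
print(classification_report(y_test.values, adjusted_preds))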
The top 3 categories are simply the three categories with the highest predicted probabilities.
In [121]:
pred = grid.predict(X_test.ravel()) # predicts a category
p_prob = grid.predict_proba(X_test.ravel()) # predicts the probabilities of each category
In [85]:
idxs = np.argsort(p_prob[0])[::-1] # sorts probabilities in descending order
In [88]:
print(grid.best_estimator_.named_steps["lr"].classes_[idxs][:3])
The function below brings together all of the steps above so they can be applied to the whole X_test set.
In [91]:
def topk_scored(model, step_name, X_test, k):
    # For each sample, return the k categories with the highest predicted probability.
    results = []
    p_prob = model.predict_proba(X_test)
    for pred in p_prob:
        idxs = np.argsort(pred)[::-1]  # indices sorted by descending probability
        results.append(model.best_estimator_.named_steps[step_name].classes_[idxs][:k])
    return results
In [93]:
pps = topk_scored(grid, "lr", X_test.ravel(), 3)
The function below counts a prediction as correct if the true category appears within the top-k predicted categories.
In [95]:
def scoring_multiplek(y_test, topk):
    # Fraction of samples whose true category appears among the top-k predictions.
    result = 0
    l = len(y_test)
    for i, score in enumerate(y_test):
        if y_test[i] in topk[i]:
            result += 1
    return result / l
In [120]:
adjusted_score = scoring_multiplek(y_test.values, pps)
adjusted_score
Out[120]:
By selecting the top 3 categories, the accuracy score of Logistic Regression goes from 64% to 86%.
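For reference, the 64% baseline corresponds to the plain top-1 predictions in `pred` above; a quick side-by-side check (not an original cell, just a sketch) would be:
In [ ]:
# Baseline top-1 accuracy vs. the top-3 adjusted score
print("top-1 accuracy:", accuracy_score(y_test, pred))
print("top-3 accuracy:", adjusted_score)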
In [99]:
X_test[:3]
Out[99]:
In [100]:
y_test[:3]
Out[100]:
In [101]:
pps[:3]
Out[101]:
In [116]:
# Dataframe to review 3-category predictions
cont = list(zip(X_test, y_test.values, pps))
cols = ["Input", "Input Category", "Predicted Categories"]
results_df = pd.DataFrame(cont, columns=cols)
In [117]:
results_df.head()
Out[117]:
In [118]:
results_df.tail()
Out[118]:
In [119]:
sc = scoring_multiplek(y_test.values, pps)
sc
Out[119]:
The implementation of Logistic Regression with top 3 categories would look like the mockup below:
In [111]:
x = "new on twitter - shots of the keynote speakers, the panel members and the audiences reactions, as well as while people network. posed are fine but candid photos are preferred. the photographer will need to check in on the first floor with building security and say s/he is visiting twitter. security will send s/he to the 9th floor where they will check in with reception."
In [168]:
example = grid.predict_proba([x])  # wrap the string in a list so the vectorizer treats it as one document
ex_idxs = np.argsort(example[0])[::-1]
print(grid.best_estimator_.named_steps["lr"].classes_[ex_idxs][:3])
The correct category for this example in the dataset is "company event".